This notebook classifies Soil Carbon Challenge data by observation type and cleans up inconsistent terms, preparing the data for insertion into the atlasbiowork database. The munging and cleanup process benefits from using both Excel and pandas for inspection and cleanup.
In [2]:
import pandas as pd
#pd.set_option('mode.sim_interactive', True)
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import json, csv
import re
In [10]:
# Load the exported Challenge spreadsheet; latin1 avoids decode errors on
# legacy characters (presumably an Excel sheet saved as CSV — note the
# absolute local path will only work on this machine).
df = pd.read_csv('/Users/Peter/Documents/atlas/atlasdata/data.xls.csv', encoding='latin1')
df = df.replace(np.nan,' ', regex=True)#MUST REPLACE NAN AS NAN IS NOT ITERABLE
#df.dtypes
In [13]:
# NOTE(review): this overwrites the df loaded in the previous cell with the
# change.csv table — only one of the two load cells should be run per session.
df = pd.read_csv('/Users/Peter/Documents/scc/challenge/obs_types/change.csv')
In [94]:
#GET STRINGS OF LABELS "list_of_terms"
# Collect every distinct term appearing in the label columns into one array.
# pd.unique over the stacked column values drops terms duplicated across
# columns, which the earlier per-column .unique() + np.concatenate did not.
# Extend label_cols with 'label4'/'label5' to widen the search space (later
# replace cells already target those columns).
label_cols = ['label1', 'label2', 'label3']
#label_cols += ['label4', 'label5']
list_of_terms = pd.unique(df[label_cols].values.ravel('K'))
#sorted(list_of_terms)
In [4]:
#SEARCH AND REPLACE
#searchterm = 'lichen'
#replaceterm = 'squish'
#the_list = [item for item in list_of_terms if searchterm in str(item)]
#need to target specific columns
# NOTE(review): mycols is a new frame built from df; replacing values on it
# (even with inplace=True) would not write back into df. Assign the result
# back to the df columns instead.
mycols=df[['label1', 'label2','label3','label4','label5']]
#mycols.replace('=','wox', regex=True)
#mycols.replace(to_replace=the_list, value=replaceterm, inplace=True)
In [12]:
#df = df.replace(np.nan,' ', regex=True)
# Inspect which label4 values mention 'litter' (read-only chained selection).
# NOTE(review): .str.contains propagates NaN, so run the NaN replacement
# above first or the boolean mask will fail — confirm against this data.
df.label4[df.label4.str.contains('litter')]
Out[12]:
In [16]:
# Gather every existing term containing 'lichen' so they can later be
# consolidated into one canonical term.
searchterm = 'lichen'
replaceterm = 'moss/algae/lichen'
the_list = [item for item in list_of_terms if searchterm in str(item)]
#need to target specific columns
mycols=df[['label1', 'label2','label3','label4','label5']]
# Summary stats of the label columns, for eyeballing before the replace.
mycols.describe()
#mycols.replace(to_replace=the_list, value=replaceterm, inplace=True)
Out[16]:
In [6]:
# Consolidate every label term containing 'lichen' into one canonical term.
searchterm = 'lichen'
replaceterm = 'moss/algae/lichen'
the_list = [item for item in list_of_terms if searchterm in str(item)]
# Fixes two defects in the original cell: `thelist` was a NameError (the
# variable is `the_list`), and .replace(..., inplace=True) on the df[[...]]
# selection mutated a temporary copy, so df itself was never updated.
# Assigning the replaced frame back to the columns makes the edit stick.
label_cols = ['label1', 'label2', 'label3', 'label4', 'label5']
df[label_cols] = df[label_cols].replace(to_replace=the_list, value=replaceterm)
Out[6]:
In [29]:
# Persist the transects table for PSQL import.
# NOTE(review): `transects` is not defined in this notebook as shown — it must
# come from a cell not captured here; confirm before a fresh-kernel run.
transects.to_csv('/Users/Peter/Documents/scc/challenge/PSQL/transectsOct23.csv', index=False)
In [33]:
# Partition photo observations by the `type` column: rows whose type mentions
# 'line', 'ang' (or whose note mentions 'step back'), or 'vert'.
linephotos = df[(df.type.str.contains('line'))]
angphotos = df[(df.type.str.contains('ang')) | (df.note.str.contains('step back'))]
vertphotos = df[df.type.str.contains('vert')]
# Cell output: how many vertical photos were found.
len(vertphotos)
#re.findall('\d+', s) #finds digits in s
def get_num(x):
    """Return all digit characters of string ``x`` fused into a single int.

    Digits from separate runs are concatenated, e.g. 'Hoop 1, 125' -> 1125.
    Returns None when ``x`` contains no digits.
    """
    digits = ''.join(ch for ch in x if ch.isdigit())
    # Original had an unreachable `pass` after the conditional return;
    # the implicit None return is now explicit.
    return int(digits) if digits else None
#get_num('Hoop 1, 125\'')
#df.ix[459]
for y in range(len(df)):
Out[33]:
In [63]:
#basic row selection from http://www.gregreda.com/2013/10/26/working-with-pandas-dataframes/
#which has GREAT info on joins
# Rows recorded by observer Peter Donovan with positive latitude.
peter_plots = df[(df.lat > 0) & (df.observer.str.contains('Peter Donovan'))]
# Rows whose type marks them as plot summaries, changes, or remonitoring.
features = df[(df.type == '*plot summary')|(df.type == 'change')|(df.type.str.contains('remonitor'))];
#df.iloc[100] and df.ix[100] get the row referred to by the default 0-based index
# df.loc['Kellogg LTER'] doesn't work because it's not an index;
# dfnew = df.set_index('id'); this works even tho id is not unique
#dfnew.loc['Kellogg LTER'] and this works; use inplace=True as arg to modify existing df
# dfnew.loc['BURR1'] returns all rows for this index
#column selector
#df[['type','label3']]; need to use double [] to enclose a list
#new column
#df['new'] = df.lat + 2
In [60]:
#df['featureID'] = df.id.str[0:6] #str function slices string
#df.type #although type is a keyword this works